import pandas as pd # To read the data set
import numpy as np # Importing numpy library
import seaborn as sns # For data visualization
import matplotlib.pyplot as plt # Necessary library for plotting graphs
%matplotlib inline
sns.set(color_codes = True)
from scipy.cluster.hierarchy import dendrogram, cophenet, linkage
from sklearn.cluster import KMeans # For KMeans cluster model building
from scipy.spatial.distance import cdist # Importing cdist functionality for elbow graph
from scipy.spatial.distance import pdist # Importing pdist functionality for dendrograms
from sklearn.cluster import AgglomerativeClustering # For Agg Clust model building
from mpl_toolkits.mplot3d import Axes3D # Importing graph library for 3D visualization
from sklearn.decomposition import PCA # Importing to run pca analysis on data
from sklearn.model_selection import KFold, cross_val_score # Importing kfold for cross validation
from sklearn.model_selection import GridSearchCV # Importing for hypertuning model
from sklearn import metrics # Importing metrics
from sklearn.model_selection import train_test_split # Splitting data into train and test set
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix
from sklearn.svm import SVC # For SVM model building
from sklearn.preprocessing import StandardScaler # Importing to standardize the data
from sklearn.impute import SimpleImputer # Importing to fill in zero values in the data
from sklearn.preprocessing import LabelEncoder # Importing label encoder
# Running steps to check and prepare the data
df = pd.read_csv('vehicle-1.csv')
df.head()
df.tail()
df.shape
df.size
df.isnull().sum()
We can notice from the above result that we have null values we need to deal with, within the data frame which we will tackle later on.
df.count()
df.info()
df.dtypes
df.describe().transpose()
df.skew()
columns = list(df)
df[columns].hist(stacked = True, density = True, bins = 75, color = 'Orange', layout = (10,3), figsize=(18,55));
df['class'].value_counts()
From the above histogram we can infer, independent variables in the data frame are normally distributed.
dfcar = df[df['class'] == 'car']
dfbus = df[df['class'] == 'bus']
dfvan = df[df['class'] == 'van']
labelencoder = LabelEncoder()
df['class'] = labelencoder.fit_transform(df['class'])
df['class'].value_counts()
# Plotting a graph for visual analysis of compactness
plt.figure(figsize=(10,6))
sns.distplot(dfcar['compactness'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['compactness'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['compactness'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Compactness Distribution');
We can infer that the distribution of compactness for 'car' & 'bus' is more than 'van'
# Plotting a graph for visual analysis of circularity
plt.figure(figsize=(10,6))
sns.distplot(dfcar['circularity'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['circularity'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['circularity'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Circularity Distribution');
We can infer that the distribution of circularity for 'car' & 'bus' is more than 'van'. Here we can also notice 'bus' is almost normally distributed.
# Plotting a grpah for visual analysis of distance_circularity
plt.figure(figsize=(10,6))
sns.distplot(dfcar['distance_circularity'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['distance_circularity'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['distance_circularity'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Distance_Circularity Distribution');
We can infer that the distribution of distance_circularity for 'car' & 'bus' is more than 'van'
# Plotting a graph for visual analysis of radius_ratio
plt.figure(figsize=(10,6))
sns.distplot(dfcar['radius_ratio'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['radius_ratio'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['radius_ratio'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Radius_Ratio Distribution');
We can infer that the distribution of distance_circularity for 'car' & 'bus' is more than 'van'
# Plotting a graph for visual analysis of pr.axis_aspect_ratio
plt.figure(figsize=(10,6))
sns.distplot(dfcar['pr.axis_aspect_ratio'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['pr.axis_aspect_ratio'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['pr.axis_aspect_ratio'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('PR.Axis_Aspect_Ratio Distribution');
We can infer that the distribution of pr.axis_aspect_ratio for 'bus' is slightly higher, than compared to 'car' & 'van' which are almost identical. Both 'car' & 'van' are almost normally distributed.
# Plotting a graph for visual analysis of max.length_aspect_ratio
plt.figure(figsize=(10,6))
sns.distplot(dfcar['max.length_aspect_ratio'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['max.length_aspect_ratio'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['max.length_aspect_ratio'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Max.Length_Aspect_Ratio Distribution');
We can infer that the distribution of max.length_aspect_ratio for 'car' & 'van' is higher and almost identical, than compared to 'bus'
# Plotting a graph for visual analysis of scatter_ratio
plt.figure(figsize=(10,6))
sns.distplot(dfcar['scatter_ratio'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['scatter_ratio'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['scatter_ratio'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Scatter_Ratio Distribution');
We can infer that the distribution of scatter_ratio for 'bus' is the most followed by 'car and then 'van'
# Plotting a graph for visual analysis of elongatedness
plt.figure(figsize=(10,6))
sns.distplot(dfcar['elongatedness'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['elongatedness'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['elongatedness'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Elongatedness Distribution');
We can infer that the distribution of elongatedness for 'car' is more than that of 'bus' & 'van'
# Plotting a graph for visual analysis of pr.axis_rectangularity
plt.figure(figsize=(10,6))
sns.distplot(dfcar['pr.axis_rectangularity'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['pr.axis_rectangularity'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['pr.axis_rectangularity'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('PR.Axis_Rectangularity Distribution');
We can infer that the distribution of pr.axis_rectangularity for 'bus' is more than that of 'car' & 'van'
# Plotting a graph for visual analysis of max.length_rectangularity
plt.figure(figsize=(10,6))
sns.distplot(dfcar['max.length_rectangularity'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['max.length_rectangularity'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['max.length_rectangularity'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Max.Length_Rectangularity Distribution');
We can infer that the distribution of max.length_rectangularity for 'car' is more than that of 'bus' & 'van'. Here we also notice that 'bus' is almost normally distributed.
# Plotting a graph for visual analysis of scaled_variance
plt.figure(figsize=(10,6))
sns.distplot(dfcar['scaled_variance'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['scaled_variance'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['scaled_variance'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Scaled_Variance Distribution');
We can infer that the distribution of scaled_variance for 'bus' is more than that of 'car' & 'van'
# Plotting a graph for visual analysis of scaled_variance.1
plt.figure(figsize=(10,6))
sns.distplot(dfcar['scaled_variance.1'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['scaled_variance.1'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['scaled_variance.1'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Scaled_Variance.1 Distribution');
We can infer that the distribution of scaled_variance.1 for 'bus' is more than that of 'car' & 'van'
# Plotting a graph for visual analysis of scaled_radius_of_gyration
plt.figure(figsize=(10,6))
sns.distplot(dfcar['scaled_radius_of_gyration'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['scaled_radius_of_gyration'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['scaled_radius_of_gyration'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Scaled_Radius_Of_Gyration Distribution');
We can infer that the distribution of scaled_radius_of_gyration for 'bus' is more than that of 'car' & 'van'. Here we notice that 'bus' is almost normally distributed.
# Plotting a graph for visual analysis of scaled_radius_of_gyration.1
plt.figure(figsize=(10,6))
sns.distplot(dfcar['scaled_radius_of_gyration.1'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['scaled_radius_of_gyration.1'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['scaled_radius_of_gyration.1'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Scaled_Radius_Of_Gyration.1 Distribution');
We can infer that the distribution of scaled_radius_of_gyration.1 for 'bus' & 'van' is more and almost similar than that of 'car'.
# Plotting a graph for visual analysis of skewness_about
plt.figure(figsize=(10,6))
sns.distplot(dfcar['skewness_about'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['skewness_about'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['skewness_about'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Skewness_About Distribution');
We can infer that the distribution of skewness_about for 'car' & 'van' is more and almost similar than that of 'bus'.
# Plotting a graph for visual analysis of skewness_about.1
plt.figure(figsize=(10,6))
sns.distplot(dfcar['skewness_about.1'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['skewness_about.1'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['skewness_about.1'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Skewness_About.1 Distribution');
We can infer that the distribution of skewness_about.1 for 'car is more than that of 'bus' & 'van' which is almost similar.
# Plotting a graph for visual analysis of skewness_about.2
plt.figure(figsize=(10,6))
sns.distplot(dfcar['skewness_about.2'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['skewness_about.2'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['skewness_about.2'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Skewness_About.2 Distribution');
We can infer that the distribution of skewness_about.2 is almost similar for 'car', 'van' & 'bus'.
# Plotting a graph for visual analysis of hollows_ratio
plt.figure(figsize=(10,6))
sns.distplot(dfcar['hollows_ratio'], kde = True, color = 'b', label = 'Car')
sns.distplot(dfbus['hollows_ratio'], kde = True, color = 'r', label = 'Bus')
sns.distplot(dfvan['hollows_ratio'], kde = True, color = 'g', label = 'Van')
plt.legend(loc='best')
plt.title('Hollows_Ratio Distribution');
We can infer that the distribution of hollows_ratio is almost similar for 'car', 'van' & 'bus'.
# Visualizing a pair plot for the data
sns.pairplot(df, diag_kind='kde', hue = 'class');
Our Inference from the above pairplot:
df.corr()
# Visualizing through a graph the different correlations in the data
plt.figure(figsize=(22,17))
corr = df.corr()
sns.heatmap(corr, annot = True, cmap = 'Pastel1', vmin = -1, vmax = 1);
# Finding highly correlated features
corr_pos = corr.abs()
mask = (corr_pos < 0.70)
fig, ax = plt.subplots(figsize=[22,17])
sns.heatmap(corr, annot = True, center = 0, vmin = -1, vmax = 1, mask = mask, cmap = 'Pastel1');
Our Inference from the above heatmap :
dfpie = df.groupby('class')['class'].count()
dfpie.plot.pie(shadow = False, startangle = 45, autopct = '%.2f');
Our Class label of vehicle type, consists of three types (Van / Bus & Car). Among these Car's contribute to 51% of the data, Bus & Van collectively contribute 49% of data.
labelencoder = LabelEncoder()
df['class'] = labelencoder.fit_transform(df['class'])
df['class'].value_counts()
miss_values = df.columns[df.isnull().any()]
df[miss_values].isnull().sum()
df[df['circularity'].isnull()][miss_values]
# Row 105,118,266 has missing values in more than 1 column, dropping them
df.drop([105,118,266], inplace=True)
# Replacing the values with median value of the corresponding class
df.loc[5].loc['class'], df.loc[396].loc['class']
# Class belongs to bus
Median_circularity_bus = df['circularity'][df['class'] == 0].median()
Median_circularity_bus
# Replacing NaN with median values
df['circularity'].fillna(Median_circularity_bus, inplace=True)
# Double checking if missing values have been removed
df[df['circularity'].isnull()][miss_values]
df[df['distance_circularity'].isnull()][miss_values]
# Row 207 has missing values in more than 1 column, dropping it
df.drop(207, inplace=True)
# Replacing the values with median value of the corresponding class
df.loc[35].loc['class'], df.loc[319].loc['class']
# Class belongs to van & bus
Median_distance_circularity_van = df['distance_circularity'][df['class'] == 2].median()
Median_distance_circularity_bus = df['distance_circularity'][df['class'] == 0].median()
Median_distance_circularity_van, Median_distance_circularity_bus
# Replacing NaN with median values
df.loc[35] = df.loc[35].replace(np.nan, Median_distance_circularity_van)
df.loc[319] = df.loc[319].replace(np.nan, Median_distance_circularity_bus)
# Double checking if missing values have been removed
df.loc[[35,319]]
df[df['radius_ratio'].isnull()][miss_values]
Only radius ratio is having missing values, all the other columns do not have missing values. We will not drop any rather replace with median of corresponding class.
# Replacing the values with median value of the corresponding class
df.loc[[9,78,159,287,345,467]]['class']
# Class belongs to car, bus & van
Median_radius_ratio_car = df['radius_ratio'][df['class'] == 1].median()
Median_radius_ratio_bus = df['radius_ratio'][df['class'] == 0].median()
Median_radius_ratio_van = df['radius_ratio'][df['class'] == 2].median()
Median_radius_ratio_car, Median_radius_ratio_bus, Median_radius_ratio_van
# Replacing NaN with median values of car, for rows (9,159,467)
df.loc[[9,159,467]] = df.loc[[9,159,467]].replace(np.nan, Median_radius_ratio_car)
# Double checking if missing values have been removed
df.loc[[9,159,467]]
# Replacing NaN with median value of bus, for rows (78,345)
df.loc[[78,345]] = df.loc[[78,345]].replace(np.nan, Median_radius_ratio_bus)
# Double checking if missing values have been removed
df.loc[[78,345]]
# Replacing NaN with median values of van, for row (287)
df.loc[287] = df.loc[287].replace(np.nan, Median_radius_ratio_van)
# Double checking if missing values have been removed
df.loc[[287]]
df[df['pr.axis_aspect_ratio'].isnull()][miss_values]
There are 2 rows with missing values. One row has missing value in one more column in addition to pr.axis_aspect_ratio.
We will drop that row but treat the missing value in pr.axis_aspect_ratio with median of corresponding class
# Row 222 has missing values in more than 1 column, dropping it
df.drop(222, inplace=True)
# Replacing the values with median value of the corresponding class
df.loc[19].loc['class']
# Class belongs to car
Median_pr_axis_aspect_ratio_car = df['pr.axis_aspect_ratio'][df['class'] == 1].median()
Median_pr_axis_aspect_ratio_car
# Replacing NaN with median values
df.loc[19] = df.loc[19].replace(np.nan, Median_pr_axis_aspect_ratio_car)
# Double checking if missing values have been removed
df[df['pr.axis_aspect_ratio'].isnull()][miss_values]
df[df['scatter_ratio'].isnull()][miss_values]
Only one row and 2 columns have missing value in that row including scatter_ratio hence we will drop this row.
# Row 249 has missing values in more than 1 column, dropping it
df.drop(249, inplace = True)
df[df['elongatedness'].isnull()][miss_values]
We will treat the missing value in elongatedness with median of corresponding class
# Replacing the values with median value of the corresponding class
df.loc[215].loc['class']
# Class belongs to car
Median_elongatedness_car = df['elongatedness'][df['class'] == 1].median()
Median_elongatedness_car
# Replacing NaN with median values
df.loc[215] = df.loc[215].replace(np.nan, Median_elongatedness_car)
# Double checking if missing values have been removed
df[df['elongatedness'].isnull()][miss_values]
df[df['pr.axis_rectangularity'].isnull()][miss_values]
We will treat the missing value in pr.axis_rectangularity with median of corresponding class
# Replacing the values with median value of the corresponding class
df.loc[[70,237,273]]['class']
# Class belongs to car, bus & van
Median_pr_axis_rectangularity_car = df['pr.axis_rectangularity'][df['class'] == 1].median()
Median_pr_axis_rectangularity_bus = df['pr.axis_rectangularity'][df['class'] == 0].median()
Median_pr_axis_rectangularity_van = df['pr.axis_rectangularity'][df['class'] == 2].median()
Median_pr_axis_rectangularity_car, Median_pr_axis_rectangularity_bus, Median_pr_axis_rectangularity_van
# Replacing NaN with median value of car, for row (70)
df.loc[70] = df.loc[70].replace(np.nan, Median_pr_axis_rectangularity_car)
# Double checking if missing values have been removed
df[df['pr.axis_rectangularity'].isnull()][miss_values]
# Replacing NaN with median value of bus, for row (237)
df.loc[237] = df.loc[237].replace(np.nan, Median_pr_axis_rectangularity_bus)
# Double checking if missing values have been removed
df[df['pr.axis_rectangularity'].isnull()][miss_values]
# Replacing NaN with median value of van, for row (273)
df.loc[273] = df.loc[273].replace(np.nan, Median_pr_axis_rectangularity_van)
# Double checking if missing values have been removed
df[df['pr.axis_rectangularity'].isnull()][miss_values]
df[df['scaled_variance'].isnull()][miss_values]
We will treat the missing value in scaled_variance with median of corresponding class
# Replacing the values with median value of the corresponding class
df.loc[372].loc['class'], df.loc[522].loc['class']
# Class belongs to car & van
Median_scaled_variance_car = df['scaled_variance'][df['class'] == 1].median()
Median_scaled_variance_van = df['scaled_variance'][df['class'] == 2].median()
Median_scaled_variance_car, Median_scaled_variance_van
# Replacing NaN with median value of car, for row (522)
df.loc[522] = df.loc[522].replace(np.nan, Median_scaled_variance_car)
# Double checking if missing values have been removed
df[df['scaled_variance'].isnull()][miss_values]
# Replacing NaN with median value of van, for row (372)
df.loc[372] = df.loc[372].replace(np.nan, Median_scaled_variance_van)
# Double checking if missing values have been removed
df[df['scaled_variance'].isnull()][miss_values]
df[df['scaled_variance.1'].isnull()][miss_values]
We will treat the missing value in scaled_variance.1 with median of corresponding class
# Replacing the values with median value of the corresponding class
df.loc[308].loc['class'], df.loc[496].loc['class']
# Class belongs to car
Median_scaled_variance1_car = df['scaled_variance.1'][df['class'] == 1].median()
Median_scaled_variance1_car
# Replacing NaN with median values
df['scaled_variance.1'].fillna(Median_scaled_variance1_car, inplace = True)
# Double checking if missing values have been removed
df[df['scaled_variance.1'].isnull()][miss_values]
df[df['scaled_radius_of_gyration'].isnull()][miss_values]
There are no missing values in scaled_radius_of_gyration
df[df['scaled_radius_of_gyration.1'].isnull()][miss_values]
# Row 66 has missing values in more than 1 column, dropping it
df.drop(66, inplace = True)
# Replacing the values with median value of the corresponding class
df.loc[[77, 192, 329]]['class']
# Class belongs to car
Median_scaled_radius_of_gyration1_car = df['scaled_radius_of_gyration.1'][df['class'] == 1].median()
Median_scaled_radius_of_gyration1_car
# Replacing NaN with median values
df['scaled_radius_of_gyration.1'].fillna(Median_scaled_radius_of_gyration1_car, inplace = True)
# Double checking if missing values have been removed
df[df['scaled_radius_of_gyration.1'].isnull()][miss_values]
df[df['skewness_about'].isnull()][miss_values]
We will treat the missing value in skewness_about with median of corresponding class
# Replacing the values with median value of the corresponding class
df.loc[[141,177,285]]['class']
# Class belongs to car & bus
Median_skewness_about_car = df['skewness_about'][df['class'] == 1].median()
Median_skewness_about_bus = df['skewness_about'][df['class'] == 0].median()
Median_skewness_about_car, Median_skewness_about_bus
# Replacing NaN with median value of bus, for row (141,177)
df.loc[[141,177]] = df.loc[[141,177]].replace(np.nan, Median_skewness_about_bus)
# Double checking if missing values have been removed
df[df['skewness_about'].isnull()][miss_values]
# Replacing NaN with median value of car, for row (285)
df.loc[285] = df.loc[285].replace(np.nan, Median_skewness_about_car)
# Double checking if missing values have been removed
df[df['skewness_about'].isnull()][miss_values]
df[df['skewness_about.1'].isnull()][miss_values]
There are no missing values in skewness_about.1
df[df['skewness_about.2'].isnull()][miss_values]
We will treat the missing value in skewness_about.2 with median of corresponding class
# Replacing the values with median value of the corresponding class
df.loc[419].loc['class']
# Class belongs to car
Median_skewness_about2_car = df['skewness_about.2'][df['class'] == 1].median()
Median_skewness_about2_car
# Replacing NaN with median values
df['skewness_about.2'].fillna(Median_skewness_about2_car, inplace = True)
# Double checking if missing values have been removed
df[df['skewness_about.2'].isnull()][miss_values]
def outliers_transform_with_replace_mean(base_dataset):
num_features=[col for col in base_dataset.select_dtypes(np.number).columns ]
print("Outliers in Dataset before Treatment")
print("====================================")
for i,cols in enumerate(num_features,start=1):
x = base_dataset[cols]
qr3, qr1=np.percentile(x, [75,25])
iqr=qr3-qr1
utv=qr3+(1.5*(iqr))
ltv=qr1-(1.5*(iqr))
count=(base_dataset[base_dataset[cols]>utv][cols].count())+(base_dataset[base_dataset[cols]<ltv][cols].count())
print("Column ",cols,"\t has ",count," outliers")
for i,cols in enumerate(num_features,start=1):
x = base_dataset[cols]
qr3, qr1=np.percentile(x, [75,25])
iqr=qr3-qr1
utv=qr3+(1.5*(iqr))
ltv=qr1-(1.5*(iqr))
y=[]
for p in x:
if p <ltv or p>utv:
y.append(np.mean(x))
else:
y.append(p)
base_dataset[cols]=y
print("\nOutliers in Dataset after Treatment")
print("====================================")
for i,cols in enumerate(num_features,start=1):
x = base_dataset[cols]
qr3, qr1=np.percentile(x, [75,25])
iqr=qr3-qr1
utv=qr3+(1.5*(iqr))
ltv=qr1-(1.5*(iqr))
count=(base_dataset[base_dataset[cols]>utv][cols].count())+(base_dataset[base_dataset[cols]<ltv][cols].count())
print("Column ",cols,"\t has ",count," outliers")
outliers_transform_with_replace_mean(df)
So by now we analyze each column and find that there are outliers in some columns. Our next step is to know whether these outliers are natural or artificial. If natural then we have to do nothing but if these outliers are artificial, we need to tackle them.
After seeing the max values of above outliers columns, it looks like outliers in above columns are natural not a typing error mistake or artificial.
Note: It is entirely my assumption on how i have wished to interpret it, as there is no way to prove whether these outliers are natural or artificial. We know that mostly algorithms are affected by outliers and outliers may affect our model buidling activity. When we build our SVM on the above data it is affected by outliers hence it is better to drop those outliers, like we have done above.
df['class'].value_counts()
# Visualizing new value counts of class field
sns.countplot(df['class'])
plt.show()
From above we can see that cars are most followed by bus and then vans.
# Train & Test split
x = df.drop('class', axis=1) # Dropping field from independent variables
y = df[['class']]
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=1, test_size=0.30)
# Checking the split data, into train and test set
print('{0:0.2f}% Data Training Set'.format((len(x_train)/len(df.index))*100))
print('{0:0.2f}% Data Testing Set'. format((len(x_test)/len(df.index))*100))
x.head()
x_train.describe()
x_train.isna().sum()
svmlin = SVC(C = 7, gamma = 0.015, kernel = 'linear', random_state=1)
svmlin.fit(x_train, y_train.values.ravel()) # Fitting model on training data
svmlin.score(x_train, y_train) # Training data score
svmlin.score(x_test, y_test) # Test data score
ypred = svmlin.predict(x_test)
svmlin_acc = accuracy_score(y_test, ypred) # Getting accuracy score of model on test data
svmlin_conf = confusion_matrix(y_test, ypred) # Visualizing confusion matrix of the model
svmlin_class = classification_report(y_test, ypred) # Visualizing classification report of the model
print('The Accuracy Score of the model is {0:.2f}%'. format(svmlin_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmlin_conf)
print()
print('The Classification Report of the model is :\n', svmlin_class)
# Plotting confusion matrix graphically
cm = metrics.confusion_matrix(y_test, ypred, labels=[0,1,2])
df_cm = pd.DataFrame(cm, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm, annot=True, cmap = 'Pastel1', fmt = 'g');
svmrbf = SVC(C = 2, gamma = 0.001, kernel = 'rbf', random_state=1)
svmrbf.fit(x_train, y_train.values.ravel()) # Fitting model on training data
svmrbf.score(x_train, y_train) # Training data score
svmrbf.score(x_test, y_test) # Testing data score
ypre = svmrbf.predict(x_test)
svmrbf_acc = accuracy_score(y_test, ypre) # Getting accuracy score of model on test data
svmrbf_conf = confusion_matrix(y_test, ypre) # Visualizing confusion matrix of the model
svmrbf_class = classification_report(y_test, ypre) # Visualizing classification report of the model
print('The Accuracy Score of the model is {0:.2f}%'. format(svmrbf_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmrbf_conf)
print()
print('The Classification Report of the model is :\n', svmrbf_class)
# Plotting confusion matrix graphically
cm1 = metrics.confusion_matrix(y_test, ypre, labels=[0,1,2])
df_cm1 = pd.DataFrame(cm1, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm1, annot=True, cmap = 'Pastel1', fmt = 'g');
svmpoly = SVC(C = 1, gamma = 0.020, kernel = 'poly', random_state = 1)
svmpoly.fit(x_train, y_train.values.ravel()) # Fitting model on training data
svmpoly.score(x_train, y_train) # Training data score
svmpoly.score(x_test, y_test) # Testing data score
ypr = svmpoly.predict(x_test)
svmpoly_acc = accuracy_score(y_test, ypr) # Getting accuracy score of model on test data
svmpoly_conf = confusion_matrix(y_test, ypr) # Visualizing confusion matrix of the model
svmpoly_class = classification_report(y_test, ypr) # Visualizing classification score of the model
print('The Accuracy Score of the model is {0:.2f}%'. format(svmpoly_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmpoly_conf)
print()
print('The Classification Report of the model is :\n', svmpoly_class)
# Plotting confusion matrix graphically
cm2 = metrics.confusion_matrix(y_test, ypr, labels=[0,1,2])
df_cm2 = pd.DataFrame(cm2, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm2, annot=True, cmap = 'Pastel1', fmt = 'g');
kfold = KFold(n_splits=10, random_state=1, shuffle = True)
svm = SVC(kernel = 'linear') # Setting the model to perform k-fold on
res = cross_val_score(estimator = svm, X = x_train, y = y_train.values.ravel(), cv = kfold) # Training model on raw data
print(res,'\n')
print('Kfold Accuracy Raw Data:{}'.format(res.mean()*100))
print('Kfold Score STD Raw Data:{}'.format(res.std()*100))
print()
print("Accuracy Raw Data: %0.2f (+/- %0.2f)" % (res.mean()*100, res.std()*100 * 2))
kfold = KFold(n_splits=10, random_state=1, shuffle = True)
svm = SVC(kernel = 'rbf') # Setting the model to perform k-fold on
result = cross_val_score(estimator = svm, X = x_train, y = y_train.values.ravel(), cv = kfold) # Training model on raw data
print(result,'\n')
print('Kfold Accuracy Raw Data:{}'.format(result.mean()*100))
print('Kfold Score STD Raw Data:{}'.format(result.std()*100))
print()
print("Accuracy Raw Data: %0.2f (+/- %0.2f)" % (result.mean()*100, result.std()*100 * 2))
kfold = KFold(n_splits = 10, random_state = 1, shuffle = True)
svm = SVC(kernel = 'poly')
result1 = cross_val_score(estimator = svm, X = x_train, y = y_train.values.ravel(), cv = kfold) # Training model on raw data
print(result1, '\n')
print('Kfold Accuracy Raw Data :{}'.format(result1.mean()*100))
print('Kfold Score STD Raw Data :{}'.format(result1.std()*100))
print()
print('Accuracy Raw Data: %0.2f (+/- %0.2f)' % (result1.mean()*100, result1.std()*100*2))
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_sd = sc.fit_transform(x) # Fitting training data on scaled object
x_train, x_test, y_train, y_test = train_test_split(x_sd, y, random_state=1, test_size=0.30)
# Checking the split data, into train and test set
print('{0:0.2f}% Data Training Set'.format((len(x_train)/len(df.index))*100))
print('{0:0.2f}% Data Testing Set'.format((len(x_test)/len(df.index))*100))
x_train.dtype
svmlin1 = SVC(C = 8, gamma = 0.025, kernel = 'linear', random_state=1)
svmlin1.fit(x_train, y_train.values.ravel()) # Fitting model on training data
svmlin1.score(x_train, y_train) # Training data score
svmlin1.score(x_test, y_test) # Testing data score
y_pred = svmlin1.predict(x_test)
svmlin1_acc = accuracy_score(y_test, y_pred) # Getting accuracy score of model on test data
svmlin1_conf = confusion_matrix(y_test, y_pred) # Visualizing confusion matrix of the model
svmlin1_class = classification_report(y_test, y_pred) # Visualizing classification report of the model
print('The Accuracy Score of the model is {0:.2f}%'. format(svmlin1_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmlin1_conf)
print()
print('The Classification Report of the model is :\n', svmlin1_class)
# Plotting confusion matrix graphically
cm3 = metrics.confusion_matrix(y_test, y_pred, labels=[0,1,2])
df_cm3 = pd.DataFrame(cm3, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm3, annot=True, cmap = 'Pastel1', fmt = 'g');
svmrbf1 = SVC(C = 8, gamma = 0.035, kernel = 'rbf', random_state=1)
svmrbf1.fit(x_train, y_train.values.ravel()) # Fitting model on training data
svmrbf1.score(x_train, y_train) # Training data score
svmrbf1.score(x_test, y_test) # Testing data score
y_pre = svmrbf1.predict(x_test)
svmrbf1_acc = accuracy_score(y_test, y_pre) # Getting accuracy score of model on test data
svmrbf1_conf = confusion_matrix(y_test, y_pre) # Visualizing confusion matrix of the model
svmrbf1_class = classification_report(y_test, y_pre) # Visualizing classification report of the model
print('The Accuracy Score of the model is {0:.2f}%'. format(svmrbf1_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmrbf1_conf)
print()
print('The Classification Report of the model is :\n', svmrbf1_class)
# Plotting confusion matrix graphically
cm4 = metrics.confusion_matrix(y_test, y_pre, labels=[0,1,2])
df_cm4 = pd.DataFrame(cm4, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm4, annot=True, cmap = 'Pastel1', fmt = 'g');
svmpoly1 = SVC(C = 200, gamma = 0.025, kernel = 'poly', random_state = 1)
svmpoly1.fit(x_train, y_train.values.ravel()) # Fitting model on training data
svmpoly1.score(x_train, y_train) # Training data score
svmpoly1.score(x_test, y_test) # Testing data score
y_pr = svmpoly1.predict(x_test)
svmpoly1_acc = accuracy_score(y_test, y_pr) # Getting accuracy score of model on test data
svmpoly1_conf = confusion_matrix(y_test, y_pr) # Visualizing confusion matrix of the model
svmpoly1_class = classification_report(y_test, y_pr) # Visualizing classification score of the model
print('The Accuracy Score of the model is {0:.2f}%'. format(svmpoly1_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmpoly1_conf)
print()
print('The Classification Report of the model is :\n', svmpoly1_class)
# Plotting confusion matrix graphically
cm5 = metrics.confusion_matrix(y_test, y_pr, labels=[0,1,2])
df_cm5 = pd.DataFrame(cm5, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm5, annot=True, cmap = 'Pastel1', fmt = 'g');
kfold1 = KFold(n_splits = 10, random_state = 1, shuffle = True)
svm = SVC(kernel = 'linear')
res1 = cross_val_score(estimator = svm, X = x_train, y = y_train.values.ravel(), cv = kfold1) # Training model on scaled data
print(res1, '\n')
print('Kfold Accuracy Scaled Data :{}'.format(res1.mean()*100))
print('Kfold Score STD Scaled Data :{}'.format(res1.std()*100))
print()
print('Accuracy Scaled Data %0.2f (+/- %0.2f)' %(res1.mean()*100, res1.std()*100*2))
kfold2 = KFold(n_splits = 10, random_state = 1, shuffle = True)
svm = SVC(kernel = 'rbf')
res2 = cross_val_score(estimator = svm, X = x_train, y = y_train.values.ravel(), cv = kfold) # Training model on scaled data
print(res2, '\n')
print('Kfold Accuracy Scaled Data :{}'.format(res2.mean()*100))
print('Kfold Score STD Scaled Data :{}'.format(res2.std()*100))
print()
print('Accuracy Scaled Data %0.2f (+/- %0.2f)' %(res2.mean()*100, res2.std()*100*2))
kfold3 = KFold(n_splits = 10, random_state = 1, shuffle = True)
svm = SVC(kernel = 'poly')
res3 = cross_val_score(estimator = svm, X = x_train, y = y_train.values.ravel(), cv = kfold) # Training model on scaled data
print(res3, '\n')
print('Kfold Accuracy Scaled Data :{}'.format(res3.mean()*100))
print('Kfold Score STD Scaled Data :{}'.format(res3.std()*100))
print()
print('Accuracy Scaled Data %0.2f (+/- %0.2f)' %(res3.mean()*100, res3.std()*100*2))
# Creating the covariance matrix
cov_matrix = np.cov(x_sd,rowvar=False)
print("cov_matrix shape:",cov_matrix.shape)
print()
print("Covariance_matrix",cov_matrix)
We can notice our covariance matrix has (18) independent features. Hence giving us a matrix (18 x 18)
# Find Eigen Vecs & Vals
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s', eig_vecs)
print()
print('Eigen Vals \n%s', eig_vals)
# The "cumulative variance explained" analysis
tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse = True)]
cum_var_exp = np.cumsum(var_exp)
print('Cummulative Variance Explained', cum_var_exp)
# Plotting the variance expalained by the principal components and the cumulative variance explained
plt.figure(figsize=(10,6))
plt.bar(range(1, eig_vals.size + 1), var_exp, align = 'center', alpha = 0.5, label = 'Individual Variance Explained')
plt.bar(range(1, eig_vals.size + 1), cum_var_exp, label = 'Cummulative Variance Explained')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.legend(loc = 'best')
plt.tight_layout()
# Find Eigen Vecs & Vals from the above covariance matrix
pca = PCA(n_components=18)
pca.fit(x_sd)
pca.explained_variance_ratio_
pca.explained_variance_
pca.components_
plt.bar(list(range(1,19)), pca.explained_variance_ratio_)
plt.xlabel('Eigen Vals / Components')
plt.ylabel('Variance Explained')
plt.show()
plt.step(list(range(1,19)), np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Eigen Vals / Components')
plt.ylabel('Cummulative Variance Explained')
plt.show()
From above we can see that 8 dimension are able to explain 95% variance of data. so we will use first 8 principal components.
pca8 = PCA(n_components = 8) # Using first 8 components
pca8.fit(x_sd)
scaled_pca8 = pca8.transform(x_sd) # Transforming 18 dimensions into 8
scaled_pca8.shape
# Visual analysis of the 8 dimensions
sns.pairplot(pd.DataFrame(scaled_pca8));
pca_x_train, pca_x_test, pca_y_train, pca_y_test = train_test_split(scaled_pca8, y, test_size = 0.30, random_state = 1)
print("shape of pca_x_train",pca_X_train.shape)
print("shape of pca_y_train",pca_y_train.shape)
print("shape of pca_x_test",pca_X_test.shape)
print("shape of pca_y_test",pca_y_test.shape)
svmlin2 = SVC(C = 4, gamma = 0.025, kernel = 'linear', random_state=1)
svmlin2.fit(pca_x_train, pca_y_train.values.ravel()) # Fitting model on PCA training data
svmlin2.score(pca_x_train, pca_y_train) # PCA Training data score
svmlin2.score(pca_x_test, pca_y_test) # PCA Training data score
y_pred1 = svmlin2.predict(pca_x_test)
svmlin2_acc = accuracy_score(pca_y_test, y_pred1) # Getting accuracy score of model on PCA test data
svmlin2_conf = confusion_matrix(pca_y_test, y_pred1) # Visualizing confusion matrix of the PCA model
svmlin2_class = classification_report(pca_y_test, y_pred1) # Visualizing classification report of the PCA model
print('The PCA Accuracy Score of the model is {0:.2f}%'. format(svmlin2_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmlin2_conf)
print()
print('The Classification Report of the model is :\n', svmlin2_class)
# Plotting confusion matrix graphically
cm6 = metrics.confusion_matrix(pca_y_test, y_pred1, labels=[0,1,2])
df_cm6 = pd.DataFrame(cm6, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm6, annot=True, cmap = 'Pastel1', fmt = 'g');
svmrbf2 = SVC(C = 8, gamma = 0.035, kernel = 'rbf', random_state=1)
svmrbf2.fit(pca_x_train, pca_y_train.values.ravel()) # Fitting model on PCA training data
svmrbf2.score(pca_x_train, pca_y_train) # PCA Training data score
svmrbf2.score(pca_x_test, pca_y_test) # PCA Testing data score
y_pre1 = svmrbf2.predict(pca_x_test)
svmrbf2_acc = accuracy_score(pca_y_test, y_pre1) # Getting accuracy score of PCA model on test data
svmrbf2_conf = confusion_matrix(pca_y_test, y_pre1) # Visualizing confusion matrix of the PCA model
svmrbf2_class = classification_report(pca_y_test, y_pre1) # Visualizing classification report of the PCA model
# Plotting confusion matrix graphically
cm7 = metrics.confusion_matrix(pca_y_test, y_pre1, labels=[0,1,2])
df_cm7 = pd.DataFrame(cm7, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm7, annot=True, cmap = 'Pastel1', fmt = 'g');
svmpoly2 = SVC(C = 170, gamma = 0.025, kernel = 'poly', random_state = 1)
svmpoly2.fit(pca_x_train, pca_y_train.values.ravel()) # Fitting model on PCA training data
svmpoly2.score(pca_x_train, pca_y_train) # PCA Training data score
svmpoly2.score(pca_x_test, pca_y_test) # PCA Testing data score
y_pr1 = svmpoly2.predict(pca_x_test)
svmpoly2_acc = accuracy_score(pca_y_test, y_pr1) # Getting accuracy score of PCA model on test data
svmpoly2_conf = confusion_matrix(pca_y_test, y_pr1) # Visualizing confusion matrix of the PCA model
svmpoly2_class = classification_report(pca_y_test, y_pr1) # Visualizing classification score of the PCA model
print('The Accuracy Score of the model is {0:.2f}%'. format(svmpoly2_acc*100))
print()
print('The Confusion Matrix of the model is :\n', svmpoly2_conf)
print()
print('The Classification Report of the model is :\n', svmpoly2_class)
# Plotting confusion matrix graphically
cm8 = metrics.confusion_matrix(pca_y_test, y_pr1, labels=[0,1,2])
df_cm8 = pd.DataFrame(cm8, index = [i for i in ['0','1','2']],
columns = [i for i in ['Bus','Car','Van']])
plt.figure(figsize=(8,5))
sns.heatmap(df_cm8, annot=True, cmap = 'Pastel1', fmt = 'g');
kfold4 = KFold(n_splits = 10, random_state = 1, shuffle = True)
svm = SVC(kernel = 'linear')
res4 = cross_val_score(estimator = svm, X = pca_x_train, y = pca_y_train.values.ravel(), cv = kfold4) # Training model on PCA scaled data
print(res4, '\n')
print('Kfold Accuracy Scaled Data :{}'.format(res4.mean()*100))
print('Kfold Score STD Scaled Data :{}'.format(res4.std()*100))
print()
print('Accuracy Scaled Data %0.2f (+/- %0.2f)' %(res4.mean()*100, res4.std()*100*2))
kfold5 = KFold(n_splits = 10, random_state = 1, shuffle = True)
svm = SVC(kernel = 'rbf')
res5 = cross_val_score(estimator = svm, X = pca_x_train, y = pca_y_train.values.ravel(), cv = kfold5) # Training model on PCA scaled data
print(res5, '\n')
print('Kfold Accuracy Scaled Data :{}'.format(res5.mean()*100))
print('Kfold Score STD Scaled Data :{}'.format(res5.std()*100))
print()
print('Accuracy Scaled Data %0.2f (+/- %0.2f)' %(res5.mean()*100, res5.std()*100*2))
kfold6 = KFold(n_splits = 10, random_state = 1, shuffle = True)
svm = SVC(kernel = 'poly')
res6 = cross_val_score(estimator = svm, X = pca_x_train, y = pca_y_train.values.ravel(), cv = kfold6) # Training model on PCA scaled data
print(res6, '\n')
print('Kfold Accuracy Scaled Data :{}'.format(res6.mean()*100))
print('Kfold Score STD Scaled Data :{}'.format(res6.std()*100))
print()
print('Accuracy Scaled Data %0.2f (+/- %0.2f)' %(res6.mean()*100, res6.std()*100*2))
# Comparing all models
df_compare = pd.DataFrame({'Algorithm':['Linear SVM (Raw Data)', 'RBF SVM (Raw Data)', 'Poly SVM (Raw Data)', 'K-FOLD Linear (Raw Data)', 'K-FOLD RBF (Raw Data)', 'K-FOLD Poly (Raw Data)', 'Linear SVM (Scaled Data)', 'RBF SVM (Scaled Data)', 'Poly SVM (Scaled Data)', 'K-FOLD Linear (Scaled Data)', 'K-FOLD RBF (Scaled Data)', 'K-FOLD Poly (Scaled Data)', 'Linear SVM (PCA Data)', 'RBF SVM (PCA Data)', 'Poly SVM(PCA Data)', 'K-Fold Linear (PCA Data)', 'K-Fold RBF (PCA Data)', 'K-Fold Poly (PCA Data)'],
'Accuracy Score (%)':[svmlin_acc*100,svmrbf_acc*100,svmpoly_acc*100,res.mean()*100, result.mean()*100, result1.mean()*100, svmlin1_acc*100, svmrbf1_acc*100, svmpoly1_acc*100, res1.mean()*100, res2.mean()*100, res3.mean()*100, svmlin2_acc*100, svmrbf2_acc*100, svmpoly2_acc*100, res4.mean()*100, res5.mean()*100, res6.mean()*100]}) # Pulling out accuracy score of all models
print("Following table shows comparison of various models")
df_compare
We can observe from the above result that "RBF SVM (Scaled Data) Model" has:
-------- Whereas "Poly SVM (Scaled Data)" has :
From the above analysis both models are almost similar but we should go with "RBF SVM (Scaled Data) Model" as the ('Precison Score') & ('Recall Score') for (Car, Bus & Van) are higher which plays an important role in data analysis. Though the Poly SVM (Scaled Data) Model" has higher or similar scores on one or two other parameters.
With reference to the K-Fold Crossvalidation we can infer that the "K-Fold RBF (Scaled Data) Model" outperformes the other K-Fold Crossvalidation techniques with an "Accuracy Score" of (95.06%). Hence we choose this model in K-Fold Crossvalidation.